{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quantize Mixtral-8x7B so it can run in 24GB GPU\n",
    "from awq import AutoAWQForCausalLM\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_path = 'mistralai/Mixtral-8x7B-Instruct-v0.1'\n",
    "quant_path = 'mixtral-instruct-awq'\n",
    "modules_to_not_convert = [\"gate\"]\n",
    "quant_config = {\n",
    "    \"zero_point\": True, \"q_group_size\": 128, \"w_bit\": 4, \"version\": \"GEMM\",\n",
    "    \"modules_to_not_convert\": modules_to_not_convert\n",
    "}\n",
    "\n",
    "# Load model\n",
    "# NOTE: pass safetensors=True to load safetensors\n",
    "model = AutoAWQForCausalLM.from_pretrained(\n",
    "    model_path, safetensors=True, **{\"low_cpu_mem_usage\": True}\n",
    ")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
    "\n",
    "# Quantize\n",
    "model.quantize(\n",
    "    tokenizer,\n",
    "    quant_config=quant_config,\n",
    "    modules_to_not_convert=modules_to_not_convert\n",
    ")\n",
    "\n",
    "# Save quantized model\n",
    "model.save_quantized(quant_path)\n",
    "tokenizer.save_pretrained(quant_path)\n",
    "\n",
    "print(f'Model is quantized and saved at \"{quant_path}\"')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Quantize Mixtral-8x7B so it can run in 24GB GPU"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
